#!/usr/bin/python # -*- coding: ISO-8859-1 -*- from __future__ import division from __future__ import absolute_import from __future__ import division, print_function, unicode_literals ########################### ### Autor: Sebastian Enger / M.Sc. ### Copyright: Sebastian Enger ### Licence: Commercial / OneTipp ### Version: 1.0.7 - 17-10-2015@23:53 Uhr ### Contact: sebastian.enger@gmail.com ### OneTipp Text Tool in Python ########################### #https://docs.python.org/2/library/configparser.html ######## export PYTHON_EGG_CACHE=/tmp import pprint import os import nltk # import rocksdb # shared library kann aktuell noch nicht gelesen werden import MySQLdb # apt-get install python-mysqldb from sphinxit.core.processor import Search # http://sphinxit.readthedocs.org/en/latest/ from sphinxit.core.helpers import BaseSearchConfig from random import randint from past.builtins import basestring # pip install future import codecs import sys from sumy.parsers.plaintext import PlaintextParser # https://github.com/miso-belica/sumy from sumy.nlp.tokenizers import Tokenizer from sumy.summarizers.lsa import LsaSummarizer as Summarizer from sumy.nlp.stemmers import Stemmer import re from transliterate import translit, get_available_language_codes import libleipzig import pprint import json from textstat.textstat import textstat # https://pypi.python.org/pypi/textstat os.environ['PYTHON_EGG_CACHE'] = '/home/compress/' ###python -m nltk.downloader -d /usr/share/nltk_data all ####python -m nltk.downloader all ###########nltk.download() # nltk.download("punkt") reload(sys) sys.setdefaultencoding('utf-8') noDoubleHash = set() ###re_match = r"[(\?|\.|\!)][(\t|\r|\n|\s|\w){0,}]([A-Za-z0-9]{1,})" # Match: ". WORT" re_match = r"(\?|\.|\!)$" # Match: ". 
# ---------------------------------------------------------------------------
# OneTipp synonym-replacement pipeline: read a German text, summarize it,
# replace roughly every 8th eligible word with a ranked synonym (Leipzig
# thesaurus first, local Sphinx/MySQL synonym DB as fallback) and write the
# rewritten text to the output file.
# ---------------------------------------------------------------------------

# Input and output file names from the command line.
# http://www.tutorialspoint.com/python/python_command_line_arguments.htm
inputfile = sys.argv[1]
outputfile = sys.argv[2]

# Read the whole input file and decode the bytes to unicode.
# Fix: use a context manager so the handle is always closed.
with open(inputfile, 'r') as _infile:
    text = _infile.read()
text = text.decode("utf-8")


class SphinxitConfig(BaseSearchConfig):
    """Connection settings for the local SphinxQL search daemon."""
    DEBUG = False
    WITH_META = False
    WITH_STATUS = False
    POOL_SIZE = 5
    # SQL_ENGINE = 'oursql'
    SEARCHD_CONNECTION = {
        'host': '127.0.0.1',
        'port': 9977,
    }


# Sphinx searchd speaks the MySQL wire protocol on port 9977.
sphinx = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=9977)  # sphinxQL
cursorSphinx = sphinx.cursor()

# Regular MySQL server holding the name table and the synonym tables.
mysql = MySQLdb.connect(
    host='127.0.0.1',
    user='root',
    passwd='###########99',
    db='onetipp',
    port=3306)  # Mysql
mysql.autocommit(True)
cursorMysql = mysql.cursor()


def log_warnings(curs):
    """Forward any MySQL warnings collected on cursor *curs* to logging.

    Fix: the original referenced ``logging`` without ever importing it,
    which raised NameError on first use.
    """
    import logging  # local import: module level is covered by a separate block
    for msg in curs.messages:
        if msg[0] == MySQLdb.Warning:
            logging.warning(msg[1])


def deumlaut(s):
    """Replace escaped latin-1 sharp-s/umlaut codepoints with real umlauts."""
    s = s.replace('\xdf', 'ss')
    s = s.replace('\xfc', 'ü')
    s = s.replace('\xdc', 'Ü')
    s = s.replace('\xf6', 'ö')
    s = s.replace('\xd6', 'Ö')
    s = s.replace('\xe4', 'ä')
    s = s.replace('\xc4', 'Ä')
    return s


def summarizeText(s):
    """Return an LSA summary of *s* keeping a random 90-100% of sentences.

    Uses sumy (https://github.com/miso-belica/sumy) with German tokenizer
    and stemmer. Each kept sentence is followed by a single space.
    """
    sentences = nltk.sent_tokenize(s)
    sentenceCount = len(sentences)
    # Random target length between ~90% of the sentences and all of them.
    randSentenceCount = randint(int((sentenceCount / 100) * 90) + 1,
                                sentenceCount)
    parser = PlaintextParser.from_string(s, Tokenizer("german"))
    stemmer = Stemmer("german")
    # summarizer = TextRankSummarizer(stemmer)
    summarizer = Summarizer(stemmer)
    summary = summarizer(parser.document, randSentenceCount)
    returnText = ""
    for sentence in summary:
        returnText += str(sentence) + " "
    return returnText


# Todos:
#   create a German stopword list; penalize synonyms containing stopwords.
def SynRanker(s, t):
    """Score synonym candidate *s* as replacement for original word *t*.

    Returns a float; higher is better. -1 means the candidate equals the
    original word, -10 means the input is unusable (empty / wrong type).
    """
    if not s or not t:
        return -10
    if not isinstance(s, basestring) or not isinstance(t, basestring):
        return -10
    startVal = 1.0
    lenSyn = len(s)
    re_sonder = r"(\?|\.|\,|\;|\:|\!|\d)"  # punctuation or digits
    re_space = r"(\t|\r|\n|\s|\w)"
    firstS = s[0:1]
    firstT = t[0:1]
    if s == t:
        # Identical word is never a useful replacement.
        return -1
    if lenSyn <= 0:
        return -10
    # Very short candidates are penalized; 4-13 chars is the neutral range.
    if lenSyn <= 3:
        startVal -= 0.35
    # Penalize multi-word candidates, slightly favor short single words.
    if (' ' in s) and lenSyn >= 14:
        startVal -= 0.75
    elif (' ' in s) and lenSyn < 14:
        startVal -= 0.55
    elif (' ' not in s) and lenSyn >= 14:
        startVal -= 0.05
    elif (' ' not in s) and lenSyn < 14:
        startVal += 0.05
    if re.search(re_space, s) is not None:
        startVal -= 0.50
    if re.search(re_sonder, s) is not None:
        startVal -= 0.075
    # Prefer candidates whose capitalization matches the original word.
    if firstS.isupper() and firstT.isupper():
        startVal += 0.15
    elif firstS.islower() and firstT.islower():
        startVal += 0.15
    elif firstS.isupper() and not firstT.isupper():
        startVal -= 0.25
    elif firstS.islower() and not firstT.islower():
        startVal -= 0.25
    return float(startVal)


def iround(x):
    """iround(number) -> integer

    Round a number to the nearest integer."""
    return int(round(x) - .5) + (x > 0)


def getSynLeipzig(sl):
    """Query the Leipzig thesaurus for synonyms of word *sl*.

    Returns a list of synonym strings (possibly empty) and logs the raw
    result into the ``synonym_leipzig`` MySQL table as JSON.
    """
    # print ("Auto Syn - Leipzig: ", libleipzig.Thesaurus("Auto",10))
    retContent = []
    if not sl:
        return retContent
    elif not isinstance(sl, basestring):
        return retContent
    elif len(sl) < 3:
        return retContent
    retSaveMysql = "W:" + sl
    synLeipzig = libleipzig.Thesaurus(sl, 150)
    if not synLeipzig:
        return retContent
    for aSyn in synLeipzig:
        retContent.append(str(aSyn[0]))
        retSaveMysql += ";S:" + str(aSyn[0])
    if len(retSaveMysql) > 5:
        raw = json.dumps(retSaveMysql)
        loggit = "INSERT INTO synonym_leipzig(raw,uid) VALUES(%s, %s)"
        try:
            cursorMysql.execute(loggit, (raw, 0))
            mysql.commit()
        except MySQLdb.ProgrammingError:
            # Best-effort logging only; the synonyms are still returned.
            print("Function -getSynLeipzig()- failed: The following mysql query failed:")
            print(loggit)
    return retContent


# Trailing punctuation that must be carried over from the replaced word.
_TRAILING_PUNCT = ('.', '?', '!', ',', ';', ':')


def _chooseBestSynonym(word, candidates, prevToken):
    """Rank *candidates* for *word* and return the best unused one, or None.

    Shared by the Leipzig and the local-DB path (the original duplicated
    this logic verbatim). Also fixes two defects:
      * ``firstBestSynHit.title()`` had its result discarded, so the
        capitalization after a sentence end never happened;
      * indexing ``sortedSynList[0]`` crashed when every candidate was
        already in ``noDoubleHash``.
    """
    ranked = {}
    for cand in candidates:
        if cand not in noDoubleHash:
            ranked[cand] = SynRanker(cand, word)
    if not ranked:
        return None
    sortedSynList = sorted(ranked.items(), key=lambda x: x[1], reverse=True)
    best = str(sortedSynList[0][0])
    # If the previous token ended a sentence, capitalize the replacement.
    if re.search(re_match, prevToken) is not None:
        best = best.title()
    # Keep the original word's trailing punctuation.
    for punct in _TRAILING_PUNCT:
        if word.endswith(punct):
            best += punct
            break
    return best


# Summarize the text first, then work on the summary's tokens.
tSumy = summarizeText(text)
tokens = nltk.word_tokenize(tSumy)
tokensRaw = nltk.word_tokenize(text)
count = -1
# Leistungsschutzrecht (German ancillary copyright): up to 7 consecutive
# words may be reused verbatim, so change at least every 8th word.
changeEveryWord = 8
changeEveryWordFlag = 0
changeEveryWordTemp = 0  # temporary upcount

for word in tokens:
    count += 1
    wordTemp = word.encode('ascii', 'ignore')
    # Fix: parameterized query — the original interpolated the token
    # straight into the SQL string (injection risk, broken quoting).
    cursorMysql.execute(
        "SELECT * FROM namen_table WHERE BINARY `name` = %s LIMIT 1;",
        (wordTemp,))
    name_content = cursorMysql.fetchone()

    # A known proper name was found -> never replace it, only de-umlaut.
    if name_content is not None:
        tokens[count] = deumlaut(word)
        tokensRaw[count] = deumlaut(word)
        continue

    if changeEveryWordTemp == (changeEveryWord - 1):
        changeEveryWordFlag = 0
        changeEveryWordTemp = 0
    if changeEveryWordFlag == 1:
        changeEveryWordTemp += 1

    if len(word) >= 4 and changeEveryWordFlag == 0:
        # Try the Leipzig thesaurus first.
        sLeipzigList = getSynLeipzig(word)
        if sLeipzigList:
            best = _chooseBestSynonym(word, sLeipzigList, tokens[count - 1])
            if best is not None:
                tokens[count] = deumlaut(best)
                tokensRaw[count] = deumlaut(best)
                noDoubleHash.add(best)
                changeEveryWordFlag = 1
                changeEveryWordTemp += 1
        else:
            # Fall back to the local Sphinx-indexed synonym database.
            search_query_syn = Search(indexes=['onetipp_syn_simple'],
                                      config=SphinxitConfig)
            search_query_syn = search_query_syn.match(word).options(
                ranker='proximity_bm25',
                max_matches=1,
                max_query_time=350,
                field_weights={'synonyms': 100},
            )
            sphinx_result_syn = search_query_syn.ask()
            try:
                synID = sphinx_result_syn['result']['items'][0].values()[0]
            except IndexError:
                # No match in the synonym index for this word.
                continue
            if synID > 0:
                # Fix: parameterized query instead of % interpolation.
                cursorMysql.execute(
                    "SELECT synonyms FROM synonym_unique_simple WHERE uid = %s",
                    (synID,))
                syn_content = cursorMysql.fetchone()
                # Fix: check for None BEFORE dereferencing (the original
                # called list(syn_content) first -> TypeError on no row).
                if syn_content:
                    synContent = syn_content[0].decode(encoding="utf-8",
                                                       errors="ignore")
                    synwords = synContent.split(";")
                    best = _chooseBestSynonym(word, synwords,
                                              tokens[count - 1])
                    if best is not None:
                        tokens[count] = deumlaut(best)
                        tokensRaw[count] = deumlaut(best)
                        noDoubleHash.add(best)
                        changeEveryWordFlag = 1
                        changeEveryWordTemp += 1

# Write the result file.
outputtext = ' '.join(tokens)
outputtextRaw = ' '.join(tokensRaw)
readabilityVar = str(textstat.flesch_reading_ease(outputtextRaw))

# Fix: explicit encoding on codecs.open (the original relied on the
# setdefaultencoding hack) and no redundant f.close() inside 'with'.
with codecs.open(outputfile, 'w', encoding='utf-8') as f:
    f.write(outputtext)
    # f.write("Lesbarkeitswert : " + readabilityVar)
    # f.write(translit(outputtextRaw, 'ru'))

mysql.commit()
mysql.close()
exit(0)

"""
The Flesch Reading Ease formula
function name - flesch_reading_ease(text)
returns the Flesch Reading Ease Score.
Following table is helpful to assess the ease of readability in a document.
90-100 : Very Easy
80-89  : Easy
70-79  : Fairly Easy
60-69  : Standard
50-59  : Fairly Difficult
30-49  : Difficult
0-29   : Very Confusing
"""

") #f.write(outputtext) #f.write("

") #f.write("RUSSISCHE TRANSLITERATION: BEISPIEL VERSION") #f.write("

") #f.write(translit(outputtextRaw, 'ru')) f.close() mysql.commit() mysql.close() exit(0) """ The Flesch Reading Ease formula function name - flesch_reading_ease(text) returns the Flesch Reading Ease Score. Following table is helpful to access the ease of readability in a document. 90-100 : Very Easy 80-89 : Easy 70-79 : Fairly Easy 60-69 : Standard 50-59 : Fairly Difficult 30-49 : Difficult 0-29 : Very Confusing """